{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Run Ad-Hoc Data Bias Analysis\n",
    "\n",
    "## Run Bias Analysis In The Notebook using `smclarify`\n",
    "https://github.com/aws/amazon-sagemaker-clarify\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q smclarify==0.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip list | grep smclarify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "from smclarify.bias import report\n",
    "from typing import Dict\n",
    "from collections import defaultdict\n",
    "import pandas as pd\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Read Dataset From S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store -r bias_data_s3_uri"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(bias_data_s3_uri)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store -r balanced_bias_data_s3_uri"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(balanced_bias_data_s3_uri)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws s3 cp $bias_data_s3_uri ./data-clarify/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws s3 cp $balanced_bias_data_s3_uri ./data-clarify/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv')\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_balanced = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv')\n",
    "df_balanced.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualize Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "sns.countplot(df['star_rating'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "sns.countplot(df_balanced['star_rating'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Show Number of Reviews per Category and Star Rating"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_balanced.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_stars = df_balanced.groupby(['star_rating'], sort=False).size().reset_index(name='Count')\n",
    "print(num_stars)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num = df_balanced.groupby(['product_category','star_rating'], sort=False).size().reset_index(name='Count')\n",
    "print(num)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculate Pre-Training Bias Metrics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define \n",
    "* Facet Column (= Product Category), \n",
    "* Label Column (= Star Rating), \n",
    "* Positive Label Value (= 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "facet_column = report.FacetColumn(name='product_category')\n",
    "label_column = report.LabelColumn(name='star_rating', \n",
    "                                  data=df_balanced['star_rating'], \n",
    "                                  positive_label_values=[5, 4] # this doesn't matter for class-imbalance bias\n",
    "                                 )\n",
    "group_variable = df_balanced['product_category']\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run SageMaker Clarify Bias Report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "report.bias_report(df_balanced, \n",
    "                   facet_column, \n",
    "                   label_column, \n",
    "                   stage_type=report.StageType.PRE_TRAINING, \n",
    "                   group_variable=group_variable\n",
    "                  )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# EXTRA: Create A Completely Balanced Dataset Across Categories AND Star Ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_giftcard5 = df_balanced.query('star_rating==5 and product_category==\"Gift Card\"')\n",
    "# df_giftcard5=df_giftcard5[:343]\n",
    "# df_giftcard5.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_giftcard4 = df_balanced.query('star_rating==4 and product_category==\"Gift Card\"')\n",
    "# df_giftcard4=df_giftcard4[:343]\n",
    "# df_giftcard4.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_giftcard3 = df_balanced.query('star_rating==3 and product_category==\"Gift Card\"')\n",
    "# df_giftcard3=df_giftcard3[:343]\n",
    "# df_giftcard3.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_giftcard2 = df_balanced.query('star_rating==2 and product_category==\"Gift Card\"')\n",
    "# df_giftcard2=df_giftcard2[:343]\n",
    "# df_giftcard2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_giftcard1 = df_balanced.query('star_rating==1 and product_category==\"Gift Card\"')\n",
    "# df_giftcard1=df_giftcard1[:343]\n",
    "# df_giftcard1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_videogames5 = df_balanced.query('star_rating==5 and product_category==\"Digital_Video_Games\"')\n",
    "# df_videogames5=df_videogames5[:343]\n",
    "# df_videogames5.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_videogames4 = df_balanced.query('star_rating==4 and product_category==\"Digital_Video_Games\"')\n",
    "# df_videogames4=df_videogames4[:343]\n",
    "# df_videogames4.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_videogames3 = df_balanced.query('star_rating==3 and product_category==\"Digital_Video_Games\"')\n",
    "# df_videogames3=df_videogames3[:343]\n",
    "# df_videogames3.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_videogames2 = df_balanced.query('star_rating==2 and product_category==\"Digital_Video_Games\"')\n",
    "# df_videogames2=df_videogames2[:343]\n",
    "# df_videogames2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_videogames1 = df_balanced.query('star_rating==1 and product_category==\"Digital_Video_Games\"')\n",
    "# df_videogames1=df_videogames1[:343]\n",
    "# df_videogames1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_software5 = df_balanced.query('star_rating==5 and product_category==\"Digital_Software\"')\n",
    "# df_software5=df_software5[:343]\n",
    "# df_software5.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_software4 = df_balanced.query('star_rating==4 and product_category==\"Digital_Software\"')\n",
    "# df_software4=df_software4[:343]\n",
    "# df_software4.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_software3 = df_balanced.query('star_rating==3 and product_category==\"Digital_Software\"')\n",
    "# df_software3=df_software3[:343]\n",
    "# df_software3.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_software2 = df_balanced.query('star_rating==2 and product_category==\"Digital_Software\"')\n",
    "# df_software2=df_software2[:343]\n",
    "# df_software2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_software1 = df_balanced.query('star_rating==1 and product_category==\"Digital_Software\"')\n",
    "# df_software1=df_software1[:343]\n",
    "# df_software1.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate 4th Sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_bla5 = df_balanced.query('star_rating==5 and product_category==\"Digital_Software\"')\n",
    "# df_bla5['product_category'] = 'bla' # df_bla5['product_category'].str.replace('Digital_Software', 'bla')\n",
    "# df_bla5=df_bla5[:343]\n",
    "# df_bla5.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_bla4 = df_balanced.query('star_rating==4 and product_category==\"Digital_Software\"')\n",
    "# df_bla4['product_category'] = 'bla' # df_bla4['product_category'].str.replace('Digital_Software', 'bla')\n",
    "# df_bla4=df_bla4[:343]\n",
    "# df_bla4.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_bla3 = df_balanced.query('star_rating==3 and product_category==\"Digital_Software\"')\n",
    "# df_bla3['product_category'] = 'bla' # df_bla3['product_category'].str.replace('Digital_Software', 'bla')\n",
    "\n",
    "# df_bla3=df_bla3[:343]\n",
    "# df_bla3.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_bla2 = df_balanced.query('star_rating==2 and product_category==\"Digital_Software\"')\n",
    "# df_bla2['product_category'] = 'bla' # df_bla2['product_category'].str.replace('Digital_Software', 'bla')\n",
    "\n",
    "# df_bla2=df_bla2[:343]\n",
    "# df_bla2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_bla1 = df_balanced.query('star_rating==1 and product_category==\"Digital_Software\"')\n",
    "# df_bla1['product_category'] = 'bla' # df_bla1['product_category'].str.replace('Digital_Software', 'bla')\n",
    "# df_bla1=df_bla1[:343]\n",
    "# df_bla1.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate 5th Sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_blah5 = df_balanced.query('star_rating==5 and product_category==\"Digital_Software\"')\n",
    "# df_blah5['product_category'] = 'blah' # df_bla5['product_category'].str.replace('Digital_Software', 'bla')\n",
    "# df_blah5=df_blah5[:343]\n",
    "# df_blah5.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_blah4 = df_balanced.query('star_rating==4 and product_category==\"Digital_Software\"')\n",
    "# df_blah4['product_category'] = 'blah' # df_bla4['product_category'].str.replace('Digital_Software', 'bla')\n",
    "# df_blah4=df_blah4[:343]\n",
    "# df_blah4.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_blah3 = df_balanced.query('star_rating==3 and product_category==\"Digital_Software\"')\n",
    "# df_blah3['product_category'] = 'bla' # df_bla3['product_category'].str.replace('Digital_Software', 'bla')\n",
    "\n",
    "# df_blah3=df_blah3[:343]\n",
    "# df_blah3.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_blah2 = df_balanced.query('star_rating==2 and product_category==\"Digital_Software\"')\n",
    "# df_blah2['product_category'] = 'blah' # df_bla2['product_category'].str.replace('Digital_Software', 'bla')\n",
    "\n",
    "# df_blah2=df_blah2[:343]\n",
    "# df_blah2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_blah1 = df_balanced.query('star_rating==1 and product_category==\"Digital_Software\"')\n",
    "# df_blah1['product_category'] = 'blah' # df_bla1['product_category'].str.replace('Digital_Software', 'bla')\n",
    "# df_blah1=df_blah1[:343]\n",
    "# df_blah1.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Concatenate All Data Frames Into `Super` Balanced Data Frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_super_balanced = pd.concat([df_giftcard5, \n",
    "# #                               df_giftcard4, \n",
    "#                                df_giftcard3, \n",
    "# #                               df_giftcard2, \n",
    "#                                df_giftcard1, \n",
    "#                                df_videogames5, \n",
    "# #                               df_videogames4, \n",
    "#                                df_videogames3, \n",
    "# #                               df_videogames2, \n",
    "#                                df_videogames1, \n",
    "#                               df_software5, \n",
    "# #                               df_software4, \n",
    "#                                df_software3, \n",
    "# #                               df_software2, \n",
    "#                               df_software1,\n",
    "# #                               df_bla5,\n",
    "# #                                df_bla4,\n",
    "# #                                df_bla3,\n",
    "# #                                df_bla2,                               \n",
    "# #                               df_bla1,\n",
    "# #                               df_blah5,\n",
    "# #                                df_blah4,\n",
    "# #                                df_blah3,\n",
    "# #                                df_blah2,                               \n",
    "# #                               df_blah1                               \n",
    "#                               ], ignore_index=True, sort=False)\n",
    "# df_super_balanced.shape"
   ]
  }
 ],
 "metadata": {
  "instance_type": "ml.t3.medium",
  "kernelspec": {
   "display_name": "Python 3 (Data Science)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
